@comment{SIBGRAPI 2018 paper, exported from the INPE urlib digital library.
  Review cleanup: removed the duplicated "url" field (the dx.doi.org link was
  redundant with the "doi" field; the persistent urlib.net URL is kept),
  restored Title Case in "title" with the model name brace-protected against
  style recasing, and dropped the stray trailing period in "keywords".
  INPE-specific fields (ibi, targetfile, urlaccessdate, conference-location,
  conference-year) are intentionally preserved for urlib tooling.}

@InProceedings{DuartePenaAlme:2018:BaAtVi,
               author = "Duarte, Leonardo Assuane and Penatti, Ot{\'a}vio Augusto Bizetto
                         and Almeida, Jurandy",
          affiliation = "{Universidade Federal de S{\~a}o Paulo - UNIFESP} and {SAMSUNG
                         Research Institute} and {Universidade Federal de S{\~a}o Paulo -
                         UNIFESP}",
                title = "{Bag of Attributes} for Video Event Retrieval",
            booktitle = "Proceedings...",
                 year = "2018",
               editor = "Ross, Arun and Gastal, Eduardo S. L. and Jorge, Joaquim A. and
                         Queiroz, Ricardo L. de and Minetto, Rodrigo and Sarkar, Sudeep and
                         Papa, Jo{\~a}o Paulo and Oliveira, Manuel M. and Arbel{\'a}ez,
                         Pablo and Mery, Domingo and Oliveira, Maria Cristina Ferreira de
                         and Spina, Thiago Vallin and Mendes, Caroline Mazetto and Costa,
                         Henrique S{\'e}rgio Gutierrez and Mejail, Marta Estela and Geus,
                         Klaus de and Scheer, Sergio",
         organization = "Conference on Graphics, Patterns and Images, 31. (SIBGRAPI)",
            publisher = "IEEE Computer Society",
              address = "Los Alamitos",
             keywords = "video event retrieval, video representation, visual dictionaries,
                         semantics",
             abstract = "In this paper, we present the Bag-of-Attributes (BoA) model for
                         video representation aiming at video event retrieval. The BoA
                         model is based on a semantic feature space for representing
                         videos, resulting in high-level video feature vectors. For
                         creating a semantic space, i.e., the attribute space, we can train
                         a classifier using a labeled image dataset, obtaining a
                         classification model that can be understood as a high-level
                         codebook. This model is used to map low-level frame vectors into
                         high-level vectors (e.g., classifier probability scores). Then, we
                         apply pooling operations to the frame vectors to create the final
                         bag of attributes for the video. In the BoA representation, each
                         dimension corresponds to one category (or attribute) of the
                         semantic space. Other interesting properties are: compactness,
                         flexibility regarding the classifier, and ability to encode
                         multiple semantic concepts in a single video representation. Our
                         experiments considered the semantic space created by
                         state-of-the-art convolutional neural networks pre-trained on 1000
                         object categories of ImageNet. Such deep neural networks were used
                         to classify each video frame and then different coding strategies
                         were used to encode the probability distribution from the softmax
                         layer into a frame vector. Next, different pooling strategies were
                         used to combine frame vectors in the BoA representation for a
                         video. Results using BoA were comparable or superior to the
                         baselines in the task of video event retrieval using the EVVE
                         dataset, with the advantage of providing a much more compact
                         representation.",
  conference-location = "Foz do Igua{\c{c}}u, PR, Brazil",
      conference-year = "29 Oct.-1 Nov. 2018",
                  doi = "10.1109/SIBGRAPI.2018.00064",
             language = "en",
                  ibi = "8JMKD3MGPAW/3RN82FP",
                  url = "http://urlib.net/ibi/8JMKD3MGPAW/3RN82FP",
           targetfile = "59paper.pdf",
        urlaccessdate = "2024, Apr. 29"
}